R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

df = read.csv("Traffic_Crashes_250206.csv")
df_filtered = df %>% 
  filter(MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT")

df_filtered$MOST_SEVERE_INJURY <- factor(df_filtered$MOST_SEVERE_INJURY, 
                                        levels = c("NO INDICATION OF INJURY", 
                                                   "NONINCAPACITATING INJURY", 
                                                   "INCAPACITATING INJURY", 
                                                   "FATAL"), 
                                        ordered = TRUE)
API_key <- "24f32467-3038-4dc5-bde1-a3a7806ec34e"
register_stadiamaps(API_key)
map <- get_stadiamap(c(left = -87.8, bottom = 41.7, 
                       right = -87.6, top = 42), zoom = 12,
                     maptype = "stamen_toner_lite")
ggmap(map)

df_sampled = df_filtered[sample(1000),]


weather =  df_sampled%>%
  filter(WEATHER_CONDITION != "")

ggmap(map) + 
  geom_point(data = weather, 
             aes(x = LONGITUDE, y = LATITUDE, color = WEATHER_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))

df_sampled = df_filtered[sample(1000),]


weather =  df_sampled%>%
  filter(WEATHER_CONDITION != "")

bad_weather =  weather%>%
  filter(WEATHER_CONDITION != "CLEAR")

ggmap(map) + 
  geom_point(data = bad_weather, 
             aes(x = LONGITUDE, y = LATITUDE, color = WEATHER_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))

Rain causes the most severe injury, and most likely to be the incapacitating injury.

df_sampled = df_filtered[sample(1000),]


lighting =  df_sampled%>%
  filter(LIGHTING_CONDITION != "")


ggmap(map) + 
  geom_point(data = lighting, 
             aes(x = LONGITUDE, y = LATITUDE, color = LIGHTING_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))

df_sampled = df_filtered[sample(1000),]


raodcondi =  df_sampled%>%
  filter(ROADWAY_SURFACE_COND != "")


ggmap(map) + 
  geom_point(data = raodcondi, 
             aes(x = LONGITUDE, y = LATITUDE, color = ROADWAY_SURFACE_COND,size = MOST_SEVERE_INJURY, alpha = 0.7))

df_sampled = df_filtered[sample(1000),]


bad_raodcondi =  df_sampled%>%
  filter(ROADWAY_SURFACE_COND != "DRY")


ggmap(map) + 
  geom_point(data = bad_raodcondi, 
             aes(x = LONGITUDE, y = LATITUDE, color = ROADWAY_SURFACE_COND,size = MOST_SEVERE_INJURY, alpha = 0.7))

Primary cause relate to severity

df_sampled = df_filtered[sample(1000),]


prim_cause =  df_sampled%>%
  filter(PRIM_CONTRIBUTORY_CAUSE != "")


ggmap(map) + 
  geom_point(data = prim_cause, 
             aes(x = LONGITUDE, y = LATITUDE, color = PRIM_CONTRIBUTORY_CAUSE,size = MOST_SEVERE_INJURY, alpha = 0.7))+
  theme(legend.text = element_text(size = 4),  # Reduce text size
        legend.title = element_text(size = 5), # Smaller title
        legend.key.size = unit(0.25, "cm"))  # Reduce legend box size

Failing to reduce speed, failing to yield, following too closely are causing more severe cases.

Does speed limit influence severety?

df_sampled = df_filtered[sample(3000),]

df_sampled = df_sampled %>% 
  filter(MOST_SEVERE_INJURY != "NO INDICATION OF INJURY")



speed =  df_sampled%>%
  filter(POSTED_SPEED_LIMIT != "")

ggmap(map) + 
  geom_point(data = speed, 
             aes(x = LONGITUDE, y = LATITUDE, color = MOST_SEVERE_INJURY,size = POSTED_SPEED_LIMIT, alpha = 0.7))

More severe accidents are in higher speed limit roads

Relationship with hit and run

df_sampled = df_filtered[sample(4000),]


hitrun =  df_sampled%>%
  filter(HIT_AND_RUN_I != "")

ggmap(map) + 
  geom_point(data = hitrun, 
             aes(x = LONGITUDE, y = LATITUDE, color = HIT_AND_RUN_I,size = MOST_SEVERE_INJURY, alpha = 0.7))

There are no hit and run for fatal injury

df_recent = read.csv("Traffic_Crashes_250206.csv")
# Removes rows where MOST_SEVERE_INJURY is exactly ""
df <- df_recent[df_recent$MOST_SEVERE_INJURY != "", ]
df <- df[df$MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT", ]
table(df$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                      851                    11480                   482399 
## NONINCAPACITATING INJURY 
##                    53008
library(dplyr)

# 1. Define your majority classes
majority_classes <- c(
  "NO INDICATION OF INJURY",
  "NONINCAPACITATING INJURY",
  "INCAPACITATING INJURY"
)

# 2. Split the data into 'majority' and 'minority' subsets
df_majority <- df %>% 
  filter(MOST_SEVERE_INJURY %in% majority_classes)

df_minority <- df %>% 
  filter(!MOST_SEVERE_INJURY %in% majority_classes)
  # i.e., "FATAL" + "INCAPACITATING INJURY"

# 3. Undersample each majority class to a chosen size
#    Adjust 'target_size' to suit your needs.
target_size <- 2000

set.seed(123)  # for reproducibility
df_majority_undersampled <- df_majority %>%
  group_by(MOST_SEVERE_INJURY) %>%
  sample_n(size = target_size, replace = FALSE) %>%
  ungroup()

# 4. Combine the minority subset (kept intact) with the undersampled majority
df_undersampled <- bind_rows(df_minority, df_majority_undersampled)

# 5. Check new distribution
table(df_undersampled$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                      851                     2000                     2000 
## NONINCAPACITATING INJURY 
##                     2000
library(glmnet)

df_undersampled <- df_undersampled %>% drop_na()  # Remove rows with NA values

# Build the design matrix (X) and outcome (y)
X <- model.matrix(MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE, data = df_undersampled)[, -1]
y <- df_undersampled$MOST_SEVERE_INJURY

# Perform cross-validation for multinomial logistic LASSO
cvfit <- cv.glmnet(
  x = X,
  y = y,
  family = "multinomial",       # for multi-class
  type.multinomial = "grouped", # treats coefficients of each class as a group
  alpha = 1,                    # alpha=1 => LASSO penalty
  nfolds = 5                    # 5-fold cross-validation (adjust as needed)
)

# Plot cross-validation curves
plot(cvfit)

# Identify best lambda
best_lambda <- cvfit$lambda.1se
best_lambda
## [1] 0.0261
classnames <- cvfit$glmnet.fit$classnames
y <- factor(y, levels = classnames)  # Ensure correct class alignment

# Refit the final model at best lambda
final_model <- glmnet(
  x = X,
  y = y,
  family = "multinomial",
  alpha = 1,
  lambda = best_lambda
)
# Extract coefficients at best lambda
coef_matrix <- coef(final_model, s = best_lambda)

# Convert to a readable format
coef_list <- lapply(coef_matrix, function(m) as.matrix(m))

# Function to extract only non-zero coefficients
extract_nonzero_coefs <- function(coef_matrix) {
  non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]  # Keep only non-zero coefficients
  return(rownames(non_zero_coefs))  # Return feature names
}

# Get non-zero features for each injury class
important_features <- lapply(coef_list, extract_nonzero_coefs)

# Print only the selected features (without zero coefficients)
important_features
## $FATAL
##  [1] "(Intercept)"                                            
##  [2] "TRAFFIC_CONTROL_DEVICESCHOOL ZONE"                      
##  [3] "LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD"               
##  [4] "LIGHTING_CONDITIONDAYLIGHT"                             
##  [5] "FIRST_CRASH_TYPEFIXED OBJECT"                           
##  [6] "FIRST_CRASH_TYPEPEDESTRIAN"                             
##  [7] "FIRST_CRASH_TYPEREAR END"                               
##  [8] "PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT"
##  [9] "PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER"    
## [10] "LATITUDE"                                               
## 
## $`INCAPACITATING INJURY`
## [1] "(Intercept)"                              
## [2] "DEVICE_CONDITIONOTHER"                    
## [3] "LIGHTING_CONDITIONDAYLIGHT"               
## [4] "TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER"
## [5] "TRAFFICWAY_TYPERAMP"                      
## [6] "INTERSECTION_RELATED_IY"                  
## 
## $`NO INDICATION OF INJURY`
##  [1] "(Intercept)"                                                                              
##  [2] "POSTED_SPEED_LIMIT"                                                                       
##  [3] "LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD"                                                 
##  [4] "LIGHTING_CONDITIONUNKNOWN"                                                                
##  [5] "FIRST_CRASH_TYPEFIXED OBJECT"                                                             
##  [6] "FIRST_CRASH_TYPEHEAD ON"                                                                  
##  [7] "FIRST_CRASH_TYPEPARKED MOTOR VEHICLE"                                                     
##  [8] "FIRST_CRASH_TYPEPEDALCYCLIST"                                                             
##  [9] "FIRST_CRASH_TYPEPEDESTRIAN"                                                               
## [10] "FIRST_CRASH_TYPESIDESWIPE SAME DIRECTION"                                                 
## [11] "TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER"                                                
## [12] "TRAFFICWAY_TYPEONE-WAY"                                                                   
## [13] "TRAFFICWAY_TYPEPARKING LOT"                                                               
## [14] "INTERSECTION_RELATED_IY"                                                                  
## [15] "HIT_AND_RUN_IY"                                                                           
## [16] "PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS"                                      
## [17] "PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT"                                  
## [18] "PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY"                                             
## [19] "PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING"                                                  
## [20] "PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING"                                       
## [21] "PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER"                                      
## [22] "PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED"                                              
## [23] "PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)"
## [24] "LATITUDE"                                                                                 
## 
## $`NONINCAPACITATING INJURY`
## [1] "(Intercept)"             "FIRST_CRASH_TYPETURNING"

###logestic regression

# Corrected feature names based on dataset
selected_features <- c(
  "POSTED_SPEED_LIMIT", 
  "LIGHTING_CONDITION", 
  "PRIM_CONTRIBUTORY_CAUSE", 
  "DEVICE_CONDITION", 
  "TRAFFICWAY_TYPE", 
  "INTERSECTION_RELATED_I"
)

# Select only these features from df_undersampled
df_final <- df_undersampled %>%
  select(all_of(selected_features), MOST_SEVERE_INJURY) %>%
  drop_na()  # Remove any missing values
library(nnet)
library(caret)

# Ensure MOST_SEVERE_INJURY is a factor
df_final$MOST_SEVERE_INJURY <- as.factor(df_final$MOST_SEVERE_INJURY)

# Split into train and test sets
set.seed(42)
train_index <- createDataPartition(df_final$MOST_SEVERE_INJURY, p = 0.8, list = FALSE)
train_data <- df_final[train_index, ]
test_data <- df_final[-train_index, ]

# Train multinomial logistic regression model
logistic_model <- multinom(MOST_SEVERE_INJURY ~ ., data = train_data)
## # weights:  220 (162 variable)
## initial  value 1465.313140 
## iter  10 value 1306.466809
## iter  20 value 1213.310229
## iter  30 value 1203.513573
## iter  40 value 1198.462365
## iter  50 value 1196.637961
## iter  60 value 1195.891750
## iter  70 value 1195.598511
## iter  80 value 1195.477049
## iter  90 value 1195.449429
## iter 100 value 1195.431957
## final  value 1195.431957 
## stopped after 100 iterations
# Summary of model
summary(logistic_model)
## Call:
## multinom(formula = MOST_SEVERE_INJURY ~ ., data = train_data)
## 
## Coefficients:
##                          (Intercept) POSTED_SPEED_LIMIT
## INCAPACITATING INJURY           2.16            -0.0107
## NO INDICATION OF INJURY        -6.75            -0.0303
## NONINCAPACITATING INJURY        7.36             0.0354
##                          LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD
## INCAPACITATING INJURY                                      -0.806
## NO INDICATION OF INJURY                                    -1.116
## NONINCAPACITATING INJURY                                   -0.270
##                          LIGHTING_CONDITIONDAWN LIGHTING_CONDITIONDAYLIGHT
## INCAPACITATING INJURY                     -1.15                    -0.0235
## NO INDICATION OF INJURY                   -1.11                    -0.3311
## NONINCAPACITATING INJURY                  -1.28                    -0.0825
##                          LIGHTING_CONDITIONDUSK LIGHTING_CONDITIONUNKNOWN
## INCAPACITATING INJURY                    0.0982                    -1.299
## NO INDICATION OF INJURY                  0.4224                     0.495
## NONINCAPACITATING INJURY                 0.3986                    -1.676
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING OTHER TRAFFIC SIGNS
## INCAPACITATING INJURY                                                     -0.776
## NO INDICATION OF INJURY                                                  -22.688
## NONINCAPACITATING INJURY                                                 -28.779
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING ROAD MARKINGS
## INCAPACITATING INJURY                                                17.10
## NO INDICATION OF INJURY                                              -4.90
## NONINCAPACITATING INJURY                                             -6.83
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING STOP SIGN
## INCAPACITATING INJURY                                             1.31
## NO INDICATION OF INJURY                                           1.01
## NONINCAPACITATING INJURY                                         -6.45
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS
## INCAPACITATING INJURY                                                 -0.103
## NO INDICATION OF INJURY                                               -1.013
## NONINCAPACITATING INJURY                                              -7.263
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING YIELD SIGN
## INCAPACITATING INJURY                                             15.94
## NO INDICATION OF INJURY                                           -4.81
## NONINCAPACITATING INJURY                                          -7.36
##                          PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM INSIDE VEHICLE
## INCAPACITATING INJURY                                                       -1.21
## NO INDICATION OF INJURY                                                    -21.48
## NONINCAPACITATING INJURY                                                    -7.66
##                          PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM OUTSIDE VEHICLE
## INCAPACITATING INJURY                                                       -0.370
## NO INDICATION OF INJURY                                                     -0.483
## NONINCAPACITATING INJURY                                                    -8.212
##                          PRIM_CONTRIBUTORY_CAUSEDRIVING ON WRONG SIDE/WRONG WAY
## INCAPACITATING INJURY                                                    -0.346
## NO INDICATION OF INJURY                                                  -1.859
## NONINCAPACITATING INJURY                                                 -8.666
##                          PRIM_CONTRIBUTORY_CAUSEDRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## INCAPACITATING INJURY                                                          19.3
## NO INDICATION OF INJURY                                                        19.9
## NONINCAPACITATING INJURY                                                       11.8
##                          PRIM_CONTRIBUTORY_CAUSEEQUIPMENT - VEHICLE CONDITION
## INCAPACITATING INJURY                                                   13.55
## NO INDICATION OF INJURY                                                 15.75
## NONINCAPACITATING INJURY                                                 7.57
##                          PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT
## INCAPACITATING INJURY                                                      -1.52
## NO INDICATION OF INJURY                                                    -3.18
## NONINCAPACITATING INJURY                                                   -9.31
##                          PRIM_CONTRIBUTORY_CAUSEEXCEEDING SAFE SPEED FOR CONDITIONS
## INCAPACITATING INJURY                                                        -0.178
## NO INDICATION OF INJURY                                                       0.800
## NONINCAPACITATING INJURY                                                     -8.992
##                          PRIM_CONTRIBUTORY_CAUSEFAILING TO REDUCE SPEED TO AVOID CRASH
## INCAPACITATING INJURY                                                         -0.06857
## NO INDICATION OF INJURY                                                        0.00273
## NONINCAPACITATING INJURY                                                      -7.12365
##                          PRIM_CONTRIBUTORY_CAUSEFAILING TO YIELD RIGHT-OF-WAY
## INCAPACITATING INJURY                                                   0.702
## NO INDICATION OF INJURY                                                 1.111
## NONINCAPACITATING INJURY                                               -6.450
##                          PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY
## INCAPACITATING INJURY                                            1.65
## NO INDICATION OF INJURY                                          2.95
## NONINCAPACITATING INJURY                                        -4.96
##                          PRIM_CONTRIBUTORY_CAUSEHAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## INCAPACITATING INJURY                                                                     15.62
## NO INDICATION OF INJURY                                                                   -7.77
## NONINCAPACITATING INJURY                                                                   8.49
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING
## INCAPACITATING INJURY                                       13.7
## NO INDICATION OF INJURY                                     16.8
## NONINCAPACITATING INJURY                                     7.5
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER LANE USAGE
## INCAPACITATING INJURY                                         0.814
## NO INDICATION OF INJURY                                       2.419
## NONINCAPACITATING INJURY                                     -6.069
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING
## INCAPACITATING INJURY                                                  1.13
## NO INDICATION OF INJURY                                                2.50
## NONINCAPACITATING INJURY                                              -6.37
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER TURNING/NO SIGNAL
## INCAPACITATING INJURY                                                0.813
## NO INDICATION OF INJURY                                              1.950
## NONINCAPACITATING INJURY                                            -6.320
##                          PRIM_CONTRIBUTORY_CAUSENOT APPLICABLE
## INCAPACITATING INJURY                                   -0.810
## NO INDICATION OF INJURY                                 -0.406
## NONINCAPACITATING INJURY                                -8.296
##                          PRIM_CONTRIBUTORY_CAUSEOPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## INCAPACITATING INJURY                                                                                                     -0.345
## NO INDICATION OF INJURY                                                                                                   -1.275
## NONINCAPACITATING INJURY                                                                                                  -7.780
##                          PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER
## INCAPACITATING INJURY                                                  -1.15
## NO INDICATION OF INJURY                                                -2.45
## NONINCAPACITATING INJURY                                               -8.38
##                          PRIM_CONTRIBUTORY_CAUSEROAD ENGINEERING/SURFACE/MARKING DEFECTS
## INCAPACITATING INJURY                                                              -6.18
## NO INDICATION OF INJURY                                                            17.57
## NONINCAPACITATING INJURY                                                           -6.93
##                          PRIM_CONTRIBUTORY_CAUSETEXTING
## INCAPACITATING INJURY                             -6.09
## NO INDICATION OF INJURY                           -5.12
## NONINCAPACITATING INJURY                          15.73
##                          PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED
## INCAPACITATING INJURY                                          -6.87
## NO INDICATION OF INJURY                                        18.44
## NONINCAPACITATING INJURY                                       -6.82
##                          PRIM_CONTRIBUTORY_CAUSEUNABLE TO DETERMINE
## INCAPACITATING INJURY                                        -0.373
## NO INDICATION OF INJURY                                       0.224
## NONINCAPACITATING INJURY                                     -7.761
##                          PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## INCAPACITATING INJURY                                                                                       -0.181
## NO INDICATION OF INJURY                                                                                    -17.670
## NONINCAPACITATING INJURY                                                                                    -7.683
##                          PRIM_CONTRIBUTORY_CAUSEVISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## INCAPACITATING INJURY                                                                         -0.188
## NO INDICATION OF INJURY                                                                       -0.440
## NONINCAPACITATING INJURY                                                                      -7.760
##                          PRIM_CONTRIBUTORY_CAUSEWEATHER
## INCAPACITATING INJURY                              17.9
## NO INDICATION OF INJURY                            18.5
## NONINCAPACITATING INJURY                           11.5
##                          DEVICE_CONDITIONFUNCTIONING PROPERLY
## INCAPACITATING INJURY                                 -0.0377
## NO INDICATION OF INJURY                                9.2468
## NONINCAPACITATING INJURY                              -0.3784
##                          DEVICE_CONDITIONNO CONTROLS
## INCAPACITATING INJURY                          -0.23
## NO INDICATION OF INJURY                         8.79
## NONINCAPACITATING INJURY                       -0.57
##                          DEVICE_CONDITIONNOT FUNCTIONING DEVICE_CONDITIONOTHER
## INCAPACITATING INJURY                             -0.191                  16.1
## NO INDICATION OF INJURY                            8.511                  22.4
## NONINCAPACITATING INJURY                         -21.656                  15.1
##                          DEVICE_CONDITIONUNKNOWN
## INCAPACITATING INJURY                     0.0689
## NO INDICATION OF INJURY                   8.9131
## NONINCAPACITATING INJURY                 -0.1265
##                          TRAFFICWAY_TYPECENTER TURN LANE
## INCAPACITATING INJURY                             -0.926
## NO INDICATION OF INJURY                           -1.799
## NONINCAPACITATING INJURY                          -0.557
##                          TRAFFICWAY_TYPEDIVIDED - W/MEDIAN (NOT RAISED)
## INCAPACITATING INJURY                                            -0.557
## NO INDICATION OF INJURY                                          -0.198
## NONINCAPACITATING INJURY                                          0.407
##                          TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER
## INCAPACITATING INJURY                                       -0.136
## NO INDICATION OF INJURY                                     -0.746
## NONINCAPACITATING INJURY                                    -0.365
##                          TRAFFICWAY_TYPEDRIVEWAY TRAFFICWAY_TYPENOT DIVIDED
## INCAPACITATING INJURY                      -12.1                     -0.393
## NO INDICATION OF INJURY                     14.3                      0.198
## NONINCAPACITATING INJURY                    15.0                      0.523
##                          TRAFFICWAY_TYPEONE-WAY TRAFFICWAY_TYPEOTHER
## INCAPACITATING INJURY                   -0.7291               0.1597
## NO INDICATION OF INJURY                  0.5823              -0.0841
## NONINCAPACITATING INJURY                 0.0056               0.5888
##                          TRAFFICWAY_TYPEPARKING LOT TRAFFICWAY_TYPERAMP
## INCAPACITATING INJURY                        -0.415               22.55
## NO INDICATION OF INJURY                       1.811               -3.97
## NONINCAPACITATING INJURY                      1.394               -4.88
##                          TRAFFICWAY_TYPEUNKNOWN INTERSECTION_RELATED_IN
## INCAPACITATING INJURY                     0.229                  -1.667
## NO INDICATION OF INJURY                  -0.453                  -1.101
## NONINCAPACITATING INJURY                  1.530                  -0.738
##                          INTERSECTION_RELATED_IY
## INCAPACITATING INJURY                      0.202
## NO INDICATION OF INJURY                   -0.698
## NONINCAPACITATING INJURY                   0.123
## 
## Std. Errors:
##                          (Intercept) POSTED_SPEED_LIMIT
## INCAPACITATING INJURY           1.94             0.0249
## NO INDICATION OF INJURY         1.22             0.0250
## NONINCAPACITATING INJURY        2.20             0.0276
##                          LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD
## INCAPACITATING INJURY                                       0.658
## NO INDICATION OF INJURY                                     0.679
## NONINCAPACITATING INJURY                                    0.675
##                          LIGHTING_CONDITIONDAWN LIGHTING_CONDITIONDAYLIGHT
## INCAPACITATING INJURY                     0.875                      0.656
## NO INDICATION OF INJURY                   0.898                      0.675
## NONINCAPACITATING INJURY                  0.918                      0.677
##                          LIGHTING_CONDITIONDUSK LIGHTING_CONDITIONUNKNOWN
## INCAPACITATING INJURY                      1.04                      1.21
## NO INDICATION OF INJURY                    1.04                      1.05
## NONINCAPACITATING INJURY                   1.05                      1.37
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING OTHER TRAFFIC SIGNS
## INCAPACITATING INJURY                                                   1.26e+00
## NO INDICATION OF INJURY                                                 1.47e-09
## NONINCAPACITATING INJURY                                                1.35e-08
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING ROAD MARKINGS
## INCAPACITATING INJURY                                             7.32e-08
## NO INDICATION OF INJURY                                           3.30e-10
## NONINCAPACITATING INJURY                                          7.28e-08
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING STOP SIGN
## INCAPACITATING INJURY                                             1.03
## NO INDICATION OF INJURY                                           1.09
## NONINCAPACITATING INJURY                                          1.05
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS
## INCAPACITATING INJURY                                                  0.484
## NO INDICATION OF INJURY                                                0.653
## NONINCAPACITATING INJURY                                               0.487
##                          PRIM_CONTRIBUTORY_CAUSEDISREGARDING YIELD SIGN
## INCAPACITATING INJURY                                          7.16e-08
## NO INDICATION OF INJURY                                        5.16e-10
## NONINCAPACITATING INJURY                                       6.23e-08
##                          PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM INSIDE VEHICLE
## INCAPACITATING INJURY                                                    9.19e-01
## NO INDICATION OF INJURY                                                  9.69e-09
## NONINCAPACITATING INJURY                                                 8.00e-01
##                          PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM OUTSIDE VEHICLE
## INCAPACITATING INJURY                                                        0.870
## NO INDICATION OF INJURY                                                      0.959
## NONINCAPACITATING INJURY                                                     1.002
##                          PRIM_CONTRIBUTORY_CAUSEDRIVING ON WRONG SIDE/WRONG WAY
## INCAPACITATING INJURY                                                     0.619
## NO INDICATION OF INJURY                                                   1.077
## NONINCAPACITATING INJURY                                                  0.743
##                          PRIM_CONTRIBUTORY_CAUSEDRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## INCAPACITATING INJURY                                                         0.292
## NO INDICATION OF INJURY                                                       0.283
## NONINCAPACITATING INJURY                                                      0.318
##                          PRIM_CONTRIBUTORY_CAUSEEQUIPMENT - VEHICLE CONDITION
## INCAPACITATING INJURY                                                   0.795
## NO INDICATION OF INJURY                                                 0.641
## NONINCAPACITATING INJURY                                                0.620
##                          PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT
## INCAPACITATING INJURY                                                      0.488
## NO INDICATION OF INJURY                                                    1.016
## NONINCAPACITATING INJURY                                                   0.589
##                          PRIM_CONTRIBUTORY_CAUSEEXCEEDING SAFE SPEED FOR CONDITIONS
## INCAPACITATING INJURY                                                         0.854
## NO INDICATION OF INJURY                                                       0.827
## NONINCAPACITATING INJURY                                                      1.204
##                          PRIM_CONTRIBUTORY_CAUSEFAILING TO REDUCE SPEED TO AVOID CRASH
## INCAPACITATING INJURY                                                            0.432
## NO INDICATION OF INJURY                                                          0.458
## NONINCAPACITATING INJURY                                                         0.427
##                          PRIM_CONTRIBUTORY_CAUSEFAILING TO YIELD RIGHT-OF-WAY
## INCAPACITATING INJURY                                                   0.411
## NO INDICATION OF INJURY                                                 0.419
## NONINCAPACITATING INJURY                                                0.415
##                          PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY
## INCAPACITATING INJURY                                           1.014
## NO INDICATION OF INJURY                                         0.992
## NONINCAPACITATING INJURY                                        1.000
##                          PRIM_CONTRIBUTORY_CAUSEHAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## INCAPACITATING INJURY                                                                  7.21e-01
## NO INDICATION OF INJURY                                                                4.55e-11
## NONINCAPACITATING INJURY                                                               7.21e-01
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING
## INCAPACITATING INJURY                                      0.712
## NO INDICATION OF INJURY                                    0.474
## NONINCAPACITATING INJURY                                   0.580
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER LANE USAGE
## INCAPACITATING INJURY                                          1.06
## NO INDICATION OF INJURY                                        1.01
## NONINCAPACITATING INJURY                                       1.05
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING
## INCAPACITATING INJURY                                                  1.04
## NO INDICATION OF INJURY                                                1.01
## NONINCAPACITATING INJURY                                               1.06
##                          PRIM_CONTRIBUTORY_CAUSEIMPROPER TURNING/NO SIGNAL
## INCAPACITATING INJURY                                                0.758
## NO INDICATION OF INJURY                                              0.745
## NONINCAPACITATING INJURY                                             0.759
##                          PRIM_CONTRIBUTORY_CAUSENOT APPLICABLE
## INCAPACITATING INJURY                                    0.485
## NO INDICATION OF INJURY                                  0.475
## NONINCAPACITATING INJURY                                 0.518
##                          PRIM_CONTRIBUTORY_CAUSEOPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## INCAPACITATING INJURY                                                                                                      0.642
## NO INDICATION OF INJURY                                                                                                    0.782
## NONINCAPACITATING INJURY                                                                                                   0.667
##                          PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER
## INCAPACITATING INJURY                                                  0.449
## NO INDICATION OF INJURY                                                0.665
## NONINCAPACITATING INJURY                                               0.456
##                          PRIM_CONTRIBUTORY_CAUSEROAD ENGINEERING/SURFACE/MARKING DEFECTS
## INCAPACITATING INJURY                                                                NaN
## NO INDICATION OF INJURY                                                         4.92e-08
## NONINCAPACITATING INJURY                                                        6.64e-08
##                          PRIM_CONTRIBUTORY_CAUSETEXTING
## INCAPACITATING INJURY                          3.52e-11
## NO INDICATION OF INJURY                        5.62e-13
## NONINCAPACITATING INJURY                       1.63e-10
##                          PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED
## INCAPACITATING INJURY                                       8.33e-11
## NO INDICATION OF INJURY                                     4.00e-08
## NONINCAPACITATING INJURY                                    5.05e-08
##                          PRIM_CONTRIBUTORY_CAUSEUNABLE TO DETERMINE
## INCAPACITATING INJURY                                         0.285
## NO INDICATION OF INJURY                                       0.283
## NONINCAPACITATING INJURY                                      0.292
##                          PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## INCAPACITATING INJURY                                                                                     6.08e-01
## NO INDICATION OF INJURY                                                                                   2.72e-07
## NONINCAPACITATING INJURY                                                                                  6.23e-01
##                          PRIM_CONTRIBUTORY_CAUSEVISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## INCAPACITATING INJURY                                                                           1.14
## NO INDICATION OF INJURY                                                                         1.36
## NONINCAPACITATING INJURY                                                                        1.19
##                          PRIM_CONTRIBUTORY_CAUSEWEATHER
## INCAPACITATING INJURY                             0.415
## NO INDICATION OF INJURY                           0.413
## NONINCAPACITATING INJURY                          0.380
##                          DEVICE_CONDITIONFUNCTIONING PROPERLY
## INCAPACITATING INJURY                                    1.41
## NO INDICATION OF INJURY                                  0.49
## NONINCAPACITATING INJURY                                 1.45
##                          DEVICE_CONDITIONNO CONTROLS
## INCAPACITATING INJURY                           1.43
## NO INDICATION OF INJURY                         0.47
## NONINCAPACITATING INJURY                        1.47
##                          DEVICE_CONDITIONNOT FUNCTIONING DEVICE_CONDITIONOTHER
## INCAPACITATING INJURY                           1.90e+00                 0.785
## NO INDICATION OF INJURY                         1.17e+00                 1.084
## NONINCAPACITATING INJURY                        3.68e-08                 0.831
##                          DEVICE_CONDITIONUNKNOWN
## INCAPACITATING INJURY                      1.500
## NO INDICATION OF INJURY                    0.591
## NONINCAPACITATING INJURY                   1.538
##                          TRAFFICWAY_TYPECENTER TURN LANE
## INCAPACITATING INJURY                               1.40
## NO INDICATION OF INJURY                             1.62
## NONINCAPACITATING INJURY                            1.70
##                          TRAFFICWAY_TYPEDIVIDED - W/MEDIAN (NOT RAISED)
## INCAPACITATING INJURY                                              1.26
## NO INDICATION OF INJURY                                            1.35
## NONINCAPACITATING INJURY                                           1.57
##                          TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER
## INCAPACITATING INJURY                                         1.29
## NO INDICATION OF INJURY                                       1.39
## NONINCAPACITATING INJURY                                      1.61
##                          TRAFFICWAY_TYPEDRIVEWAY TRAFFICWAY_TYPENOT DIVIDED
## INCAPACITATING INJURY                   7.69e-11                       1.24
## NO INDICATION OF INJURY                 8.80e-01                       1.33
## NONINCAPACITATING INJURY                8.80e-01                       1.56
##                          TRAFFICWAY_TYPEONE-WAY TRAFFICWAY_TYPEOTHER
## INCAPACITATING INJURY                      1.26                 1.40
## NO INDICATION OF INJURY                    1.34                 1.53
## NONINCAPACITATING INJURY                   1.58                 1.71
##                          TRAFFICWAY_TYPEPARKING LOT TRAFFICWAY_TYPERAMP
## INCAPACITATING INJURY                          1.73            3.31e-08
## NO INDICATION OF INJURY                        1.69            2.90e-11
## NONINCAPACITATING INJURY                       1.93            3.63e-11
##                          TRAFFICWAY_TYPEUNKNOWN INTERSECTION_RELATED_IN
## INCAPACITATING INJURY                      1.98                   1.203
## NO INDICATION OF INJURY                    1.90                   0.897
## NONINCAPACITATING INJURY                   2.24                   0.853
##                          INTERSECTION_RELATED_IY
## INCAPACITATING INJURY                      0.308
## NO INDICATION OF INJURY                    0.327
## NONINCAPACITATING INJURY                   0.315
## 
## Residual Deviance: 2391 
## AIC: 2715
# Ensure categorical variables in test_data use the same levels as in train_data
for (col in colnames(train_data)) {
  if (is.factor(train_data[[col]])) {
    test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
  }
}

# Make predictions
predictions <- predict(logistic_model, newdata = test_data)


# Confusion matrix
conf_matrix <- confusionMatrix(predictions, test_data$MOST_SEVERE_INJURY)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##                           Reference
## Prediction                 FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
##   FATAL                        7                     4                       1
##   INCAPACITATING INJURY       13                    30                       8
##   NO INDICATION OF INJURY     10                    22                      62
##   NONINCAPACITATING INJURY     3                    22                      11
##                           Reference
## Prediction                 NONINCAPACITATING INJURY
##   FATAL                                           2
##   INCAPACITATING INJURY                          31
##   NO INDICATION OF INJURY                        21
##   NONINCAPACITATING INJURY                       15
## 
## Overall Statistics
##                                         
##                Accuracy : 0.435         
##                  95% CI : (0.374, 0.498)
##     No Information Rate : 0.313         
##     P-Value [Acc > NIR] : 2.17e-05      
##                                         
##                   Kappa : 0.206         
##                                         
##  Mcnemar's Test P-Value : 0.000641      
## 
## Statistics by Class:
## 
##                      Class: FATAL Class: INCAPACITATING INJURY
## Sensitivity                0.2121                        0.385
## Specificity                0.9694                        0.717
## Pos Pred Value             0.5000                        0.366
## Neg Pred Value             0.8952                        0.733
## Prevalence                 0.1260                        0.298
## Detection Rate             0.0267                        0.115
## Detection Prevalence       0.0534                        0.313
## Balanced Accuracy          0.5908                        0.551
##                      Class: NO INDICATION OF INJURY
## Sensitivity                                   0.756
## Specificity                                   0.706
## Pos Pred Value                                0.539
## Neg Pred Value                                0.864
## Prevalence                                    0.313
## Detection Rate                                0.237
## Detection Prevalence                          0.439
## Balanced Accuracy                             0.731
##                      Class: NONINCAPACITATING INJURY
## Sensitivity                                   0.2174
## Specificity                                   0.8135
## Pos Pred Value                                0.2941
## Neg Pred Value                                0.7441
## Prevalence                                    0.2634
## Detection Rate                                0.0573
## Detection Prevalence                          0.1947
## Balanced Accuracy                             0.5154
# Accuracy
accuracy <- mean(predictions == test_data$MOST_SEVERE_INJURY)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 43.51 %"

###Random Forest

df_recent = read.csv("Traffic_Crashes_250206.csv")
# Removes rows where MOST_SEVERE_INJURY is exactly ""
df <- df_recent[df_recent$MOST_SEVERE_INJURY != "", ]
df <- df[df$MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT", ]
table(df$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                      851                    11480                   482399 
## NONINCAPACITATING INJURY 
##                    53008
library(dplyr)

# 1. Define your majority classes
majority_classes <- c(
  "NO INDICATION OF INJURY",
  "NONINCAPACITATING INJURY",
  "INCAPACITATING INJURY"
)

# 2. Split the data into 'majority' and 'minority' subsets
df_majority <- df %>% 
  filter(MOST_SEVERE_INJURY %in% majority_classes)

df_minority <- df %>% 
  filter(!MOST_SEVERE_INJURY %in% majority_classes)
  # i.e., "FATAL" + "INCAPACITATING INJURY"

# 3. Undersample each majority class to a chosen size
#    Adjust 'target_size' to suit your needs.
target_size <- 2000

set.seed(123)  # for reproducibility
df_majority_undersampled <- df_majority %>%
  group_by(MOST_SEVERE_INJURY) %>%
  sample_n(size = target_size, replace = FALSE) %>%
  ungroup()

# 4. Combine the minority subset (kept intact) with the undersampled majority
df_undersampled <- bind_rows(df_minority, df_majority_undersampled)

# 5. Check new distribution
table(df_undersampled$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                      851                     2000                     2000 
## NONINCAPACITATING INJURY 
##                     2000

lasso

library(glmnet)

df_undersampled$MOST_SEVERE_INJURY <- as.factor(df_undersampled$MOST_SEVERE_INJURY)

colSums(is.na(df_undersampled))
##               CRASH_RECORD_ID              CRASH_DATE_EST_I 
##                             0                             0 
##                    CRASH_DATE            POSTED_SPEED_LIMIT 
##                             0                             0 
##        TRAFFIC_CONTROL_DEVICE              DEVICE_CONDITION 
##                             0                             0 
##             WEATHER_CONDITION            LIGHTING_CONDITION 
##                             0                             0 
##              FIRST_CRASH_TYPE               TRAFFICWAY_TYPE 
##                             0                             0 
##                      LANE_CNT                     ALIGNMENT 
##                          5526                             0 
##          ROADWAY_SURFACE_COND                   ROAD_DEFECT 
##                             0                             0 
##                   REPORT_TYPE                    CRASH_TYPE 
##                             0                             0 
##        INTERSECTION_RELATED_I            NOT_RIGHT_OF_WAY_I 
##                             0                             0 
##                 HIT_AND_RUN_I                        DAMAGE 
##                             0                             0 
##          DATE_POLICE_NOTIFIED       PRIM_CONTRIBUTORY_CAUSE 
##                             0                             0 
##        SEC_CONTRIBUTORY_CAUSE                     STREET_NO 
##                             0                             0 
##              STREET_DIRECTION                   STREET_NAME 
##                             0                             0 
##            BEAT_OF_OCCURRENCE                PHOTOS_TAKEN_I 
##                             0                             0 
##            STATEMENTS_TAKEN_I                     DOORING_I 
##                             0                             0 
##                   WORK_ZONE_I                WORK_ZONE_TYPE 
##                             0                             0 
##             WORKERS_PRESENT_I                     NUM_UNITS 
##                             0                             0 
##            MOST_SEVERE_INJURY                INJURIES_TOTAL 
##                             0                             0 
##                INJURIES_FATAL       INJURIES_INCAPACITATING 
##                             0                             0 
##   INJURIES_NON_INCAPACITATING INJURIES_REPORTED_NOT_EVIDENT 
##                             0                             0 
##        INJURIES_NO_INDICATION              INJURIES_UNKNOWN 
##                             0                             0 
##                    CRASH_HOUR             CRASH_DAY_OF_WEEK 
##                             0                             0 
##                   CRASH_MONTH                      LATITUDE 
##                             0                            40 
##                     LONGITUDE                      LOCATION 
##                            40                             0
df_undersampled <- as.data.frame(df_undersampled)


df_undersampled <- df_undersampled %>% na.omit()
# Convert categorical variables into factors
df_undersampled$MOST_SEVERE_INJURY <- as.factor(df_undersampled$MOST_SEVERE_INJURY)

# Create X matrix AFTER removing NAs
X <- model.matrix(MOST_SEVERE_INJURY~POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE, data = df_undersampled)[, -1]

# Ensure y matches X in row count
y <- df_undersampled$MOST_SEVERE_INJURY

# Check if X and y have the same number of rows
nrow(X) == length(y)  # Should return TRUE
## [1] TRUE
# Perform cross-validation for multinomial logistic LASSO
cvfit <- cv.glmnet(
  x = X,
  y = y,
  family = "multinomial",       # for multi-class
  type.multinomial = "grouped", # treats coefficients of each class as a group
  alpha = 1,                    # alpha=1 => LASSO penalty
  nfolds = 5                    # 5-fold cross-validation (adjust as needed)
)

# Plot cross-validation curves
plot(cvfit)

# Identify best lambda
best_lambda <- cvfit$lambda.min
best_lambda
## [1] 0.018
# Convert y to a Proper Factor Before Training
y <- as.factor(y)
classnames <- levels(y)

# Refit the final model at best lambda
final_model <- glmnet(
  x = X,
  y = y,
  family = "multinomial",
  alpha = 1,
  lambda = cvfit$lambda.1se
)
coef.1se <- coef(final_model, s = "lambda.1se")  # Get coefficients at lambda.1se

# Step 6: Retrieve the names of the selected variables
var.1se <- rownames(as.matrix(coef.1se))[-1]  # Extract variable names, excluding intercept
var.1se
## [1] "INCAPACITATING INJURY"    "NO INDICATION OF INJURY" 
## [3] "NONINCAPACITATING INJURY"

random forest

library(randomForest)
library(caret)

set.seed(123)  # for reproducibility
train_index <- createDataPartition(df_undersampled$MOST_SEVERE_INJURY, p = 0.8, list = FALSE)

df_train <- df_undersampled[train_index, ]
df_test  <- df_undersampled[-train_index, ]

table(df_train$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                      132                      315                      331 
## NONINCAPACITATING INJURY 
##                      279
table(df_test$MOST_SEVERE_INJURY)
## 
##                    FATAL    INCAPACITATING INJURY  NO INDICATION OF INJURY 
##                       33                       78                       82 
## NONINCAPACITATING INJURY 
##                       69
# run the model
set.seed(123)
rf_model <- randomForest(
  MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE,   # target ~ all other columns
  data = df_train,
  ntree = 100,              # number of trees
  mtry = 15,              # if NULL, sqrt(#predictors) for classification
  importance = TRUE         # track variable importance
)

# Print summary of the model
print(rf_model)
## 
## Call:
##  randomForest(formula = MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT +      TRAFFIC_CONTROL_DEVICE + DEVICE_CONDITION + WEATHER_CONDITION +      LIGHTING_CONDITION + FIRST_CRASH_TYPE + TRAFFICWAY_TYPE +      ALIGNMENT + ROADWAY_SURFACE_COND + INTERSECTION_RELATED_I +      NOT_RIGHT_OF_WAY_I + HIT_AND_RUN_I + PRIM_CONTRIBUTORY_CAUSE +      CRASH_HOUR + CRASH_MONTH + LATITUDE + LONGITUDE, data = df_train,      ntree = 100, mtry = 15, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 56.3%
## Confusion matrix:
##                          FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## FATAL                       35                    43                      19
## INCAPACITATING INJURY       18                   135                      83
## NO INDICATION OF INJURY      8                    71                     209
## NONINCAPACITATING INJURY    18                   108                      70
##                          NONINCAPACITATING INJURY class.error
## FATAL                                          35       0.735
## INCAPACITATING INJURY                          79       0.571
## NO INDICATION OF INJURY                        43       0.369
## NONINCAPACITATING INJURY                       83       0.703
# Plot error vs. number of trees
plot(rf_model)

# 1. Predict classes
test_preds <- predict(rf_model, newdata = df_test)

# Confusion Matrix with test stats
confusionMatrix(
  data = test_preds,
  reference = df_test$MOST_SEVERE_INJURY
)
## Confusion Matrix and Statistics
## 
##                           Reference
## Prediction                 FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
##   FATAL                        7                     6                       9
##   INCAPACITATING INJURY       12                    42                      26
##   NO INDICATION OF INJURY      3                    13                      36
##   NONINCAPACITATING INJURY    11                    17                      11
##                           Reference
## Prediction                 NONINCAPACITATING INJURY
##   FATAL                                           5
##   INCAPACITATING INJURY                          39
##   NO INDICATION OF INJURY                         6
##   NONINCAPACITATING INJURY                       19
## 
## Overall Statistics
##                                         
##                Accuracy : 0.397         
##                  95% CI : (0.337, 0.459)
##     No Information Rate : 0.313         
##     P-Value [Acc > NIR] : 0.00245       
##                                         
##                   Kappa : 0.167         
##                                         
##  Mcnemar's Test P-Value : 0.00137       
## 
## Statistics by Class:
## 
##                      Class: FATAL Class: INCAPACITATING INJURY
## Sensitivity                0.2121                        0.538
## Specificity                0.9127                        0.582
## Pos Pred Value             0.2593                        0.353
## Neg Pred Value             0.8894                        0.748
## Prevalence                 0.1260                        0.298
## Detection Rate             0.0267                        0.160
## Detection Prevalence       0.1031                        0.454
## Balanced Accuracy          0.5624                        0.560
##                      Class: NO INDICATION OF INJURY
## Sensitivity                                   0.439
## Specificity                                   0.878
## Pos Pred Value                                0.621
## Neg Pred Value                                0.775
## Prevalence                                    0.313
## Detection Rate                                0.137
## Detection Prevalence                          0.221
## Balanced Accuracy                             0.658
##                      Class: NONINCAPACITATING INJURY
## Sensitivity                                   0.2754
## Specificity                                   0.7979
## Pos Pred Value                                0.3276
## Neg Pred Value                                0.7549
## Prevalence                                    0.2634
## Detection Rate                                0.0725
## Detection Prevalence                          0.2214
## Balanced Accuracy                             0.5366
# Suppose your model is called rf_model
importance_matrix <- importance(rf_model)

# Convert to a data frame for plotting
library(dplyr)
# Convert to a data frame for plotting
imp_df <- data.frame(
  Variable = rownames(importance_matrix),
  MeanDecreaseAccuracy = importance_matrix[, "MeanDecreaseAccuracy"]
)

# Sort descending by MeanDecreaseAccuracy and keep top 20
imp_df_top20 <- imp_df %>%
  arrange(desc(MeanDecreaseAccuracy)) %>%
  head(20)

# Plot the variable importance
library(ggplot2)

ggplot(imp_df_top20, 
       aes(x = reorder(Variable, MeanDecreaseAccuracy), 
           y = MeanDecreaseAccuracy)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    x = "Variable",
    y = "Mean Decrease Accuracy",
    title = "Top 20 Variables by Random Forest Importance"
  ) +
  theme_minimal()